{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training a model with `traffic_last_5min` feature\n",
    "\n",
    "\n",
    "## Introduction\n",
    "\n",
    "In this notebook, we'll train a taxifare prediction model but this time with an additional feature of `traffic_last_5min`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Nny3m465gKkY"
   },
   "outputs": [],
   "source": [
    "!sudo chown -R jupyter:jupyter /home/jupyter/training-data-analyst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import os\n",
    "import shutil\n",
    "\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "from tensorflow import keras\n",
    "\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, DenseFeatures\n",
    "from tensorflow.keras.callbacks import TensorBoard\n",
    "\n",
    "print(tf.__version__)\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROJECT = 'cloud-training-demos' # REPLACE WITH YOUR PROJECT ID\n",
    "BUCKET = 'cloud-training-demos' # REPLACE WITH YOUR BUCKET NAME\n",
    "REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For Bash Code\n",
    "os.environ['PROJECT'] = PROJECT\n",
    "os.environ['BUCKET'] = BUCKET\n",
    "os.environ['REGION'] = REGION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "gcloud config set project $PROJECT\n",
    "gcloud config set compute/region $REGION"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -l ../data/taxi-traffic*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head ../data/taxi-traffic*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use tf.data to read the CSV files\n",
    "\n",
    "These functions for reading data from the csv files are similar to what we used in the Introduction to Tensorflow module. Note that here we have an addtional feature `traffic_last_5min`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CSV_COLUMNS = [\n",
    "    'fare_amount',\n",
    "    'dayofweek',\n",
    "    'hourofday',\n",
    "    'pickup_longitude',\n",
    "    'pickup_latitude',\n",
    "    'dropoff_longitude',\n",
    "    'dropoff_latitude',\n",
    "    'traffic_last_5min'\n",
    "]\n",
    "LABEL_COLUMN = 'fare_amount'\n",
    "DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]\n",
    "\n",
    "\n",
    "def features_and_labels(row_data):\n",
    "    label = row_data.pop(LABEL_COLUMN)\n",
    "    features = row_data\n",
    "\n",
    "    return features, label\n",
    "\n",
    "\n",
    "def create_dataset(pattern, batch_size=1, mode=tf.estimator.ModeKeys.EVAL):\n",
    "    dataset = tf.data.experimental.make_csv_dataset(\n",
    "        pattern, batch_size, CSV_COLUMNS, DEFAULTS)\n",
    "\n",
    "    dataset = dataset.map(features_and_labels)\n",
    "\n",
    "    if mode == tf.estimator.ModeKeys.TRAIN:\n",
    "        dataset = dataset.shuffle(buffer_size=1000).repeat()\n",
    "\n",
    "    # take advantage of multi-threading; 1=AUTOTUNE\n",
    "    dataset = dataset.prefetch(1)\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUT_COLS = [\n",
    "    'dayofweek',\n",
    "    'hourofday',\n",
    "    'pickup_longitude',\n",
    "    'pickup_latitude',\n",
    "    'dropoff_longitude',\n",
    "    'dropoff_latitude',\n",
    "    'traffic_last_5min'\n",
    "]\n",
    "\n",
    "# Create input layer of feature columns\n",
    "feature_columns = {\n",
    "    colname: tf.feature_column.numeric_column(colname)\n",
    "    for colname in INPUT_COLS\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build a simple keras DNN model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a keras DNN model using Sequential API\n",
    "def build_model(dnn_hidden_units):\n",
    "    model = Sequential(DenseFeatures(feature_columns=feature_columns.values()))\n",
    "    \n",
    "    for num_nodes in dnn_hidden_units:\n",
    "        model.add(Dense(units=num_nodes, activation=\"relu\"))\n",
    "    \n",
    "    model.add(Dense(units=1, activation=\"linear\"))    \n",
    "    \n",
    "    # Create a custom evalution metric\n",
    "    def rmse(y_true, y_pred):\n",
    "        return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))\n",
    "\n",
    "    # Compile the keras model\n",
    "    model.compile(optimizer=\"adam\", loss=\"mse\", metrics=[rmse, \"mse\"])\n",
    "    \n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we can call the `build_model` to create the model. Here we'll have two hidden layers before our final output layer. And we'll train with the same parameters we used before."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HIDDEN_UNITS = [32, 8]\n",
    "\n",
    "model = build_model(dnn_hidden_units=HIDDEN_UNITS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 1000\n",
    "NUM_TRAIN_EXAMPLES = 10000 * 6  # training dataset will repeat, wrap around\n",
    "NUM_EVALS = 60  # how many times to evaluate\n",
    "NUM_EVAL_EXAMPLES = 10000  # enough to get a reasonable sample\n",
    "\n",
    "trainds = create_dataset(\n",
    "    pattern='../data/taxi-traffic-train*',\n",
    "    batch_size=BATCH_SIZE,\n",
    "    mode=tf.estimator.ModeKeys.TRAIN)\n",
    "\n",
    "evalds = create_dataset(\n",
    "    pattern='../data/taxi-traffic-valid*',\n",
    "    batch_size=BATCH_SIZE,\n",
    "    mode=tf.estimator.ModeKeys.EVAL).take(NUM_EVAL_EXAMPLES//1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "steps_per_epoch = NUM_TRAIN_EXAMPLES // (BATCH_SIZE * NUM_EVALS)\n",
    "\n",
    "LOGDIR = \"./taxi_trained\"\n",
    "history = model.fit(x=trainds,\n",
    "                    steps_per_epoch=steps_per_epoch,\n",
    "                    epochs=NUM_EVALS,\n",
    "                    validation_data=evalds,\n",
    "                    callbacks=[TensorBoard(LOGDIR)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "RMSE_COLS = ['rmse', 'val_rmse']\n",
    "\n",
    "pd.DataFrame(history.history)[RMSE_COLS].plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.predict(x={\"dayofweek\": tf.convert_to_tensor([6]),\n",
    "                 \"hourofday\": tf.convert_to_tensor([17]),\n",
    "                 \"pickup_longitude\": tf.convert_to_tensor([-73.982683]),\n",
    "                 \"pickup_latitude\": tf.convert_to_tensor([40.742104]),\n",
    "                 \"dropoff_longitude\": tf.convert_to_tensor([-73.983766]),\n",
    "                 \"dropoff_latitude\": tf.convert_to_tensor([40.755174]),\n",
    "                \"traffic_last_5min\": tf.convert_to_tensor([114])},\n",
    "              steps=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export and deploy model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "OUTPUT_DIR = \"./export/savedmodel\"\n",
    "shutil.rmtree(OUTPUT_DIR, ignore_errors=True)\n",
    "EXPORT_PATH = os.path.join(OUTPUT_DIR,\n",
    "                           datetime.datetime.now().strftime(\"%Y%m%d%H%M%S\"))\n",
    "tf.saved_model.save(model, EXPORT_PATH)  # with default serving function\n",
    "os.environ['EXPORT_PATH'] = EXPORT_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "PROJECT=${PROJECT}\n",
    "BUCKET=${BUCKET}\n",
    "REGION=${REGION}\n",
    "MODEL_NAME=taxifare\n",
    "VERSION_NAME=traffic\n",
    "\n",
    "if [[ $(gcloud ai-platform models list --format='value(name)' --region=$REGION | grep \"^$MODEL_NAME$\") ]]; then\n",
    "    echo \"$MODEL_NAME already exists\"\n",
    "else\n",
    "    # create model\n",
    "    echo \"Creating $MODEL_NAME\"\n",
    "    gcloud ai-platform models create --region=$REGION $MODEL_NAME\n",
    "fi\n",
    "\n",
    "if [[ $(gcloud ai-platform versions list --model $MODEL_NAME --format='value(name)' --region=$REGION | grep \"^$VERSION_NAME$\") ]]; then\n",
    "    echo \"Deleting already existing $MODEL_NAME:$VERSION_NAME ... \"\n",
    "    gcloud ai-platform versions delete --model=$MODEL_NAME $VERSION_NAME --region=$REGION\n",
    "    echo \"Please run this cell again if you don't see a Creating message ... \"\n",
    "    sleep 2\n",
    "fi\n",
    "\n",
    "# create model\n",
    "echo \"Creating $MODEL_NAME:$VERSION_NAME\"\n",
    "gcloud ai-platform versions create --model=$MODEL_NAME $VERSION_NAME --async \\\n",
    "       --framework=tensorflow --python-version=3.5 --runtime-version=1.14 \\\n",
    "       --origin=${EXPORT_PATH} --staging-bucket=gs://$BUCKET --region=$REGION"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright 2019 Google Inc. Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License"
   ]
  }
 ],
 "metadata": {
  "environment": {
   "name": "tf2-gpu.2-1.m69",
   "type": "gcloud",
   "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-1:m69"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
